Predict whether an aircraft strike with wildlife causes damage
This time I’m trying an XGBoost model.
library(tidyverse)
library(tidymodels)
library(scales)
library(skimr)
library(kableExtra)
library(patchwork)
library(gt)
library(lubridate)
library(glue)
library(themis)# colours
#pal_jo <- c(viridisLite::magma(8)[2:8], "#4C4C53", "#9B9BA8")
pal_sliced <- c(
'#e946a5', # pink
'#68e199', # green
'#302df0', # blue
'#6a45b0', # purple
'#129875', # darker green
'#4f94bf', # lighter blue
'#b545ab', # darker pink
'#000000', # black,
'#cccccc' #grey
)
title_colour <- pal_sliced[1]
table_colour <- pal_sliced[4]
bar_colour <- pal_sliced[2]
grid_colour <- pal_sliced[9]
theme_set(theme_bw() %+replace%
theme(
# align title and caption to the plot not the panel
plot.title.position = 'plot',
plot.caption.position = 'plot',
# change the title and caption to markdown and move them futher from the plot
plot.title = element_text(
size = rel(1.3),
hjust = 0,
margin = margin(c(0, 0, 10, 0)),
colour = title_colour
),
plot.subtitle = element_text(
size = rel(1.15),
hjust = 0,
margin = margin(c(0, 0, 15, 0))
),
plot.caption = element_text(
hjust = 1,
margin = margin(c(10, 0, 0, 0))
),
# move axis titles to the left/top and change them to markdown
axis.title = element_text(hjust = 1),
# allow the axis values to the markdown as well
axis.text = element_text(),
# remove the panel border
panel.border = element_blank(),
# put in the axis lines with a slightly thicker line than the gridlines
axis.line = element_line(colour = grid_colour, size = rel(1.5)),
# make the tickmarks the same colour
axis.ticks = element_line(colour = grid_colour),
# facet strip text left aligned with extra space above
strip.text = element_text(
hjust = 0, margin = margin(c(10, 0, 0, 0)), colour = title_colour
),
# clear colour and fill for strip
strip.background = element_rect(colour = NA, fill = NA),
# dotted gridlines
panel.grid = element_line(linetype = 'dotted'),
# ability to use a different colour for the gridlines
panel.grid.major.x = element_line(colour = grid_colour),
panel.grid.major.y = element_line(colour = grid_colour),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
)
)
scale_y_pct <- function(
accuracy = 1L,
breaks = pretty_breaks(),
expand = expansion(mult = c(0, .05)),
...
) {
scale_y_continuous(
labels = scales::percent_format(accuracy = accuracy, big.mark = ","),
breaks = breaks,
expand = expand,
...
)
}
scale_y_comma <- function(
accuracy = 1L,
breaks = pretty_breaks(),
expand = expansion(mult = c(0, .05)),
...
) {
scale_y_continuous(
labels = scales::comma_format(accuracy = accuracy),
breaks = breaks,
expand = expand,
...
)
}
scale_fill_jo <- function(...) {
scale_fill_manual(values = pal_sliced, ...)
}
scale_fill_discrete <- scale_fill_jo
update_geom_defaults("bar", list(fill = bar_colour))
update_geom_defaults("col", list(fill = bar_colour))
update_geom_defaults("point", list(colour = bar_colour))
update_geom_defaults("line", list(colour = bar_colour))to_build <- read_csv("s01e02/train.csv", guess_max = 200000)
── Column specification ────────────────────────────────────────────────────────
cols(
.default = col_character(),
id = col_double(),
incident_year = col_double(),
incident_month = col_double(),
incident_day = col_double(),
aircraft_model = col_double(),
aircraft_mass = col_double(),
engine_make = col_double(),
engines = col_double(),
engine1_position = col_double(),
engine2_position = col_double(),
engine4_position = col_double(),
height = col_double(),
speed = col_double(),
distance = col_double(),
damaged = col_double()
)
ℹ Use `spec()` for the full column specifications.
to_score <- read_csv("s01e02/test.csv", guess_max = 200000)
── Column specification ────────────────────────────────────────────────────────
cols(
.default = col_character(),
id = col_double(),
incident_year = col_double(),
incident_month = col_double(),
incident_day = col_double(),
aircraft_model = col_double(),
aircraft_mass = col_double(),
engine_make = col_double(),
engines = col_double(),
engine1_position = col_double(),
engine2_position = col_double(),
engine4_position = col_double(),
height = col_double(),
speed = col_double(),
distance = col_double()
)
ℹ Use `spec()` for the full column specifications.
skim(to_build)| Name | to_build |
| Number of rows | 20766 |
| Number of columns | 34 |
| _______________________ | |
| Column type frequency: | |
| character | 19 |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| operator_id | 0 | 1.00 | 3 | 5 | 0 | 275 | 0 |
| operator | 0 | 1.00 | 3 | 33 | 0 | 274 | 0 |
| aircraft | 0 | 1.00 | 3 | 20 | 0 | 424 | 0 |
| aircraft_type | 4944 | 0.76 | 1 | 1 | 0 | 2 | 0 |
| aircraft_make | 5182 | 0.75 | 2 | 3 | 0 | 62 | 0 |
| engine_model | 6275 | 0.70 | 1 | 2 | 0 | 39 | 0 |
| engine_type | 5652 | 0.73 | 1 | 3 | 0 | 8 | 0 |
| engine3_position | 19453 | 0.06 | 1 | 11 | 0 | 4 | 0 |
| airport_id | 0 | 1.00 | 3 | 5 | 0 | 1033 | 0 |
| airport | 33 | 1.00 | 4 | 53 | 0 | 1032 | 0 |
| state | 2633 | 0.87 | 2 | 2 | 0 | 60 | 0 |
| faa_region | 2241 | 0.89 | 3 | 3 | 0 | 14 | 0 |
| flight_phase | 6662 | 0.68 | 4 | 12 | 0 | 12 | 0 |
| visibility | 7625 | 0.63 | 3 | 7 | 0 | 5 | 0 |
| precipitation | 10233 | 0.51 | 3 | 15 | 0 | 8 | 0 |
| species_id | 1 | 1.00 | 1 | 6 | 0 | 446 | 0 |
| species_name | 8 | 1.00 | 4 | 50 | 0 | 444 | 0 |
| species_quantity | 530 | 0.97 | 1 | 8 | 0 | 4 | 0 |
| flight_impact | 8855 | 0.57 | 4 | 21 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 14979.38 | 8661.29 | 1 | 7456.25 | 14979.5 | 22466.75 | 30000 | ▇▇▇▇▇ |
| incident_year | 0 | 1.00 | 2006.06 | 6.72 | 1990 | 2001.00 | 2007.0 | 2012.00 | 2015 | ▂▃▅▆▇ |
| incident_month | 0 | 1.00 | 7.19 | 2.80 | 1 | 5.00 | 8.0 | 9.00 | 12 | ▃▅▆▇▆ |
| incident_day | 0 | 1.00 | 15.63 | 8.82 | 1 | 8.00 | 15.0 | 23.00 | 31 | ▇▇▇▇▆ |
| aircraft_model | 6196 | 0.70 | 24.66 | 21.72 | 0 | 10.00 | 22.0 | 37.00 | 98 | ▇▆▂▁▁ |
| aircraft_mass | 5643 | 0.73 | 3.50 | 0.89 | 1 | 3.00 | 4.0 | 4.00 | 5 | ▁▁▂▇▁ |
| engine_make | 6098 | 0.71 | 21.21 | 11.04 | 1 | 10.00 | 22.0 | 34.00 | 47 | ▇▂▆▇▁ |
| engines | 5645 | 0.73 | 2.05 | 0.46 | 1 | 2.00 | 2.0 | 2.00 | 4 | ▁▇▁▁▁ |
| engine1_position | 5786 | 0.72 | 2.99 | 2.09 | 1 | 1.00 | 1.0 | 5.00 | 7 | ▇▁▂▅▁ |
| engine2_position | 6713 | 0.68 | 2.91 | 2.01 | 1 | 1.00 | 1.0 | 5.00 | 7 | ▇▁▂▅▁ |
| engine4_position | 20419 | 0.02 | 2.01 | 1.43 | 1 | 1.00 | 1.0 | 4.00 | 5 | ▇▁▁▃▁ |
| height | 8385 | 0.60 | 820.60 | 1774.60 | 0 | 0.00 | 50.0 | 800.00 | 24000 | ▇▁▁▁▁ |
| speed | 12229 | 0.41 | 141.48 | 52.33 | 0 | 120.00 | 137.0 | 160.00 | 2500 | ▇▁▁▁▁ |
| distance | 8814 | 0.58 | 0.66 | 3.32 | 0 | 0.00 | 0.0 | 0.00 | 100 | ▇▁▁▁▁ |
| damaged | 1 | 1.00 | 0.09 | 0.28 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
Initial thoughts
explore_build <- to_build %>%
filter(! is.na(damaged)) %>%
mutate(
damaged_factor = factor(if_else(damaged == 1, 'Damaged', 'Not Damaged')),
date = ymd(paste(incident_year, incident_month, incident_day,
sep = '-')),
dow = wday(date, label = TRUE, abbr = TRUE),
log_height= log(height),
log_speed= log(speed),
log_distance = log(distance),
aircraft_mass = factor(aircraft_mass)
)set.seed(617)
explore_build %>%
select(damaged, damaged_factor, everything()) %>%
slice_sample(n = 5) %>%
gather(-id, key = 'key', value = 'value') %>%
spread(key = id, value = value) %>%
arrange(-str_detect(key, 'damag')) %>%
gt()| key | 8896 | 17349 | 20384 | 21944 | 23470 |
|---|---|---|---|---|---|
| damaged | 0 | 0 | 0 | 0 | 0 |
| damaged_factor | Not Damaged | Not Damaged | Not Damaged | Not Damaged | Not Damaged |
| aircraft | DHC8 DASH 8 | B-767-200 | UNKNOWN | UNKNOWN | C-560 |
| aircraft_make | 303 | 148 | NA | NA | 226 |
| aircraft_mass | 3 | 4 | NA | NA | 1 |
| aircraft_model | 10 | 29 | NA | NA | 45 |
| aircraft_type | A | A | NA | NA | A |
| airport | BRADLEY INTL | UNKNOWN | GEORGE BUSH INTERCONTINENTAL/ HOUSTON ARPT | YOUNGSTOWN-WARREN RGNL ARPT | UNKNOWN |
| airport_id | KBDL | ZZZZ | KIAH | KYNG | ZZZZ |
| date | 15836 | 13671 | 14762 | 14419 | 12324 |
| distance | 2 | NA | 0 | NA | NA |
| dow | Sat | Thu | Wed | Wed | Mon |
| engine_make | 31 | 22 | NA | NA | 31 |
| engine_model | 10 | 7 | NA | NA | 1 |
| engine_type | C | D | NA | NA | D |
| engine1_position | 4 | 1 | NA | NA | 5 |
| engine2_position | 4 | 1 | NA | NA | 5 |
| engine3_position | NA | NA | NA | NA | NA |
| engine4_position | NA | NA | NA | NA | NA |
| engines | 2 | 2 | NA | NA | 2 |
| faa_region | ANE | NA | ASW | AGL | NA |
| flight_impact | NONE | NONE | NA | NA | NA |
| flight_phase | APPROACH | NA | NA | TAKEOFF RUN | NA |
| height | 800 | NA | NA | NA | NA |
| incident_day | 11 | 7 | 2 | 24 | 29 |
| incident_month | 5 | 6 | 6 | 6 | 9 |
| incident_year | 2013 | 2007 | 2010 | 2009 | 2003 |
| log_distance | 0.693147180559945 | NA | -Inf | NA | NA |
| log_height | 6.68461172766793 | NA | NA | NA | NA |
| log_speed | 4.74493212836325 | NA | NA | 4.97673374242057 | NA |
| operator | COMMUTAIR | ABX AIR | UNKNOWN | MILITARY | NETJETS |
| operator_id | UCA | ABX | UNK | MIL | NJA |
| precipitation | NONE | NA | NA | NA | NA |
| species_id | UNKBS | UNKBM | I1301 | UNKBM | UNKBM |
| species_name | UNKNOWN SMALL BIRD | UNKNOWN MEDIUM BIRD | CATTLE EGRET | UNKNOWN MEDIUM BIRD | UNKNOWN MEDIUM BIRD |
| species_quantity | 1 | 1 | 1 | NA | 1 |
| speed | 115 | NA | NA | 145 | NA |
| state | CT | NA | TX | OH | NA |
| visibility | NIGHT | NA | NA | DAY | NA |
summary(explore_build$damaged) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.08553 0.00000 1.00000
ggplot(explore_build, aes(damaged_factor, fill = damaged_factor)) +
geom_bar() +
scale_y_comma() +
labs(title = 'Damaged', x = NULL, y = NULL, fill = 'Damaged') +
guides(fill = 'none')explore_num <- function(x, ...){
a <- explore_build %>%
mutate(is_it_na = if_else(is.na({{x}}), 'NA', 'Present')) %>%
ggplot(aes(is_it_na, fill = damaged_factor)) +
geom_bar() +
scale_y_comma() +
labs(x = NULL, y = NULL, fill = 'Damaged') +
guides(fill = 'none')
b <- explore_build %>%
mutate(is_it_na = if_else(is.na({{x}}), 'NA', 'Present')) %>%
ggplot(aes(is_it_na, fill = damaged_factor)) +
geom_bar(position = 'fill') +
scale_y_pct() +
labs(x = NULL, y = NULL, fill = 'Damaged') +
guides(fill = 'none')
c <- ggplot(explore_build, aes({{x}}, fill = damaged_factor)) +
geom_histogram( ...) +
scale_y_comma() +
labs(x = NULL, y = NULL, fill = 'Damaged') +
guides(fill = 'none')
d <- ggplot(explore_build, aes({{x}}, fill = damaged_factor)) +
geom_histogram(position = 'fill', ...) +
scale_y_pct() +
labs(x = NULL, y = NULL, fill = 'Damaged') +
guides(fill = 'none')
(a + b) / (c + d)
}explore_char <- function(x, ...){
c <- explore_build %>%
mutate(
tmp = fct_infreq(fct_lump_n({{x}}, 20))
) %>%
ggplot(aes(tmp, fill = damaged_factor)) +
geom_bar() +
scale_y_comma() +
scale_x_discrete(label = label_wrap(15)) +
labs(x = NULL, y = NULL, fill = 'Damaged') +
coord_flip() +
guides(fill = 'none')
d <- explore_build %>%
mutate(
tmp = fct_infreq(fct_lump_n({{x}}, 20))
) %>%
ggplot(aes(tmp, fill = damaged_factor)) +
geom_bar(position = 'fill') +
scale_y_pct() +
scale_x_discrete(label = label_wrap(15)) +
labs(x = NULL, y = NULL, fill = 'Damaged') +
coord_flip() +
guides(fill = 'none')
(c + d)
}Year the incident occurred (1990-2015)
explore_num(incident_year, binwidth = 1)Month the incident occurred (1-12)
explore_num(incident_month, binwidth = 1)Day the incident occurred (1-31)
step_date?step_holidayexplore_char(dow)International Civil Aviation Organization (ICAO) code for aircraft operators / Name of the Airline
explore_char(operator)Model of aircraft involved in the strike
For example: B-777 means a Boeing 777 was struck in the incident
explore_char(aircraft)explore_char(aircraft_type)International Civil Aviation Organization (ICAO) code for Aircraft Make
explore_char(aircraft_make)International Civil Aviation Organization (ICAO) code for Aircraft Model
explore_num(aircraft_model, binwidth = 10)1 = 2,250 kg or less
2 = 2,251-5,700 kg
3 = 5,701-27,000 kg
4 = 27,001-272,000 kg
5 = above 272,000 kg
convert to factor
explore_char(aircraft_mass)Engine Make Code
explore_num(engine_make, binwidth = 1)Engine Model Code
explore_char(engine_model)Number of engines powering the aircraft
explore_num(engines, binwidth = 1)The way the engine is powered
explore_char(engine_type)Where engine 1 is mounted on aircraft
explore_num(engine1_position)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Where engine 2 is mounted on aircraft
explore_num(engine2_position)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Where engine 3 is mounted on aircraft
Where engine 4 is mounted on aircraft
Location of the strike given as the International Civil Aviation Organization (ICAO) airport identifier / Name of the airport where the strike occurred
explore_char(airport)Abbreviated state from US, Canada
explore_char(state)FAA region where the airport is located
explore_char(faa_region)Phase of flight during which the strike occured
explore_char(flight_phase)Time of day when the strike occurred
explore_char(visibility)explore_char(precipitation)Feet About Ground Level where the strike occurred (0-24,000)
explore_num(height, bins = 50)Try log
explore_num(log_height, bins = 50)Velocity in Knots (0-2500)
explore_num(speed, bins = 50)Try log
explore_num(log_speed, bins = 50)How many miles from the airport (0-100)
explore_num(distance, binwidth = 5)Try log
explore_num(log_distance, bins = 50)International Civil Aviation Organization (ICAO) code for type of wildlife / Common name for type of wildlife struck
explore_char(species_name)Amount of wildlife struck
explore_char(species_quantity)How the strike impacted the flight
explore_char(flight_impact)set.seed(728)
split <- initial_split(to_build, strata = damaged)
train <- training(split)
test <- testing(split)set.seed(530)
train_folds <- vfold_cv(train, 5)rec <- recipe(damaged ~ ., data = train) %>%
#recipe(damaged ~ speed + height + distance + incident_year + incident_month,
# data = train) %>%
# id role
update_role(id, new_role = "ID") %>%
# damaged not missing
step_filter(! is.na(damaged)) %>%
# create variables
step_mutate(
speed = if_else(speed > 500, 500, speed),
damaged = as.character(damaged)
) %>%
# numbers to characters
step_mutate(
incident_month = as.character(incident_month),
aircraft_mass = as.character(aircraft_mass),
aircraft_model = as.character(aircraft_model),
engine_make = as.character(engine_make),
engine1_position = as.character(engine1_position),
engine2_position = as.character(engine2_position),
engines = as.character(engines)
) %>%
# remove variables
step_rm(operator_id, incident_day, species_id, airport_id,
engine3_position, engine4_position) %>%
# flag for missing values
step_indicate_na(height, speed, distance) %>%
# imputation of missing values
step_impute_mean(height, speed, distance) %>%
# character to factors
step_string2factor(all_nominal()) %>%
# explicit na
step_unknown(all_nominal_predictors()) %>%
# handle new classes
step_novel(all_nominal(), -all_outcomes()) %>%
# lump together small values
step_other(all_nominal_predictors()) %>%
# all numeric
step_dummy(all_nominal_predictors()) %>%
# missing values for factors
step_impute_lower(all_numeric_predictors()) %>%
# remove if zero variance
step_zv(all_predictors()) %>%
# rebalance
step_downsample(damaged)prepped <- prep(rec)
baked <- bake(prepped, new_data = NULL)
skim(baked)| Name | baked |
| Number of rows | 2646 |
| Number of columns | 89 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 88 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| damaged | 0 | 1 | FALSE | 2 | 0: 1323, 1: 1323 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 14697.85 | 8680.81 | 1.00 | 7009.25 | 14718.50 | 22223.75 | 30000 | ▇▇▇▇▇ |
| incident_year | 0 | 1 | 1987.62 | 158.69 | 1.66 | 2000.00 | 2006.00 | 2011.00 | 2015 | ▁▁▁▁▇ |
| height | 0 | 1 | 967.48 | 1681.51 | 0.00 | 0.00 | 817.32 | 817.32 | 17000 | ▇▁▁▁▁ |
| speed | 0 | 1 | 142.15 | 35.36 | 0.00 | 140.00 | 141.57 | 141.57 | 500 | ▁▇▁▁▁ |
| distance | 0 | 1 | 0.90 | 3.61 | 0.00 | 0.00 | 0.66 | 0.66 | 100 | ▇▁▁▁▁ |
| na_ind_height | 0 | 1 | 0.32 | 0.46 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| na_ind_speed | 0 | 1 | 0.52 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▇▁▁▁▇ |
| na_ind_distance | 0 | 1 | 0.50 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▇▁▁▁▇ |
| incident_month_X11 | 0 | 1 | 0.08 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X3 | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X4 | 0 | 1 | 0.08 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X5 | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X6 | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X7 | 0 | 1 | 0.10 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X8 | 0 | 1 | 0.12 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_X9 | 0 | 1 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| incident_month_other | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| operator_BUSINESS | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| operator_SOUTHWEST.AIRLINES | 0 | 1 | 0.08 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| operator_UNITED.AIRLINES | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| operator_UNKNOWN | 0 | 1 | 0.13 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| operator_other | 0 | 1 | 0.51 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▇▁▁▁▇ |
| aircraft_UNKNOWN | 0 | 1 | 0.14 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| aircraft_other | 0 | 1 | 0.80 | 0.40 | 0.00 | 1.00 | 1.00 | 1.00 | 1 | ▂▁▁▁▇ |
| aircraft_type_unknown | 0 | 1 | 0.15 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| aircraft_type_other | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| aircraft_make_X148 | 0 | 1 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| aircraft_make_X188 | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| aircraft_make_X583 | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| aircraft_make_unknown | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| aircraft_make_other | 0 | 1 | 0.35 | 0.48 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▅ |
| aircraft_model_X24 | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| aircraft_model_unknown | 0 | 1 | 0.20 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| aircraft_model_other | 0 | 1 | 0.68 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▃▁▁▁▇ |
| aircraft_mass_X4 | 0 | 1 | 0.46 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▇ |
| aircraft_mass_unknown | 0 | 1 | 0.17 | 0.38 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| aircraft_mass_other | 0 | 1 | 0.20 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_make_X22 | 0 | 1 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine_make_X31 | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine_make_X34 | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_make_unknown | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_make_other | 0 | 1 | 0.27 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| engine_model_X10 | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_model_X4 | 0 | 1 | 0.12 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine_model_unknown | 0 | 1 | 0.20 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_model_other | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engines_unknown | 0 | 1 | 0.18 | 0.38 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engines_other | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_type_D | 0 | 1 | 0.61 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▅▁▁▁▇ |
| engine_type_unknown | 0 | 1 | 0.18 | 0.38 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine_type_other | 0 | 1 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine1_position_X4 | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine1_position_X5 | 0 | 1 | 0.25 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| engine1_position_unknown | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine1_position_other | 0 | 1 | 0.10 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine2_position_X4 | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| engine2_position_X5 | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| engine2_position_unknown | 0 | 1 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| engine2_position_other | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| airport_other | 0 | 1 | 0.86 | 0.34 | 0.00 | 1.00 | 1.00 | 1.00 | 1 | ▁▁▁▁▇ |
| state_FL | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| state_NY | 0 | 1 | 0.04 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| state_TX | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| state_unknown | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| state_other | 0 | 1 | 0.57 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▆▁▁▁▇ |
| faa_region_AGL | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| faa_region_ANM | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| faa_region_ASO | 0 | 1 | 0.16 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| faa_region_ASW | 0 | 1 | 0.12 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| faa_region_AWP | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| faa_region_unknown | 0 | 1 | 0.14 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| faa_region_other | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| flight_phase_CLIMB | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| flight_phase_LANDING.ROLL | 0 | 1 | 0.12 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| flight_phase_TAKEOFF.RUN | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| flight_phase_unknown | 0 | 1 | 0.22 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| flight_phase_other | 0 | 1 | 0.08 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| visibility_NIGHT | 0 | 1 | 0.22 | 0.41 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| visibility_unknown | 0 | 1 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| visibility_other | 0 | 1 | 0.05 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| precipitation_unknown | 0 | 1 | 0.41 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▆ |
| precipitation_other | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| species_name_UNKNOWN.SMALL.BIRD | 0 | 1 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| species_name_other | 0 | 1 | 0.63 | 0.48 | 0.00 | 0.00 | 1.00 | 1.00 | 1 | ▅▁▁▁▇ |
| species_quantity_X2.10 | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
| species_quantity_other | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| flight_impact_unknown | 0 | 1 | 0.33 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1 | ▇▁▁▁▃ |
| flight_impact_other | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▂ |
xgboost_spec <- boost_tree(trees = 2000, min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")xgboost_workflow <- workflow() %>%
add_recipe(rec) %>%
add_model(xgboost_spec)
xgboost_workflow══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
14 Recipe Steps
• step_filter()
• step_mutate()
• step_mutate()
• step_rm()
• step_indicate_na()
• step_impute_mean()
• step_string2factor()
• step_unknown()
• step_novel()
• step_other()
• ...
• and 4 more steps.
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)
Main Arguments:
trees = 2000
min_n = tune()
tree_depth = tune()
learn_rate = tune()
loss_reduction = tune()
sample_size = tune()
Computational engine: xgboost
set.seed(34379)
xgboost_tune <- tune_grid(
xgboost_workflow,
resamples = train_folds,
grid = 9
)show_best(xgboost_tune, metric = "roc_auc") %>%
gt()| min_n | tree_depth | learn_rate | loss_reduction | sample_size | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|---|---|---|
| 23 | 15 | 5.916736e-03 | 3.415257e-07 | 0.6487175 | roc_auc | binary | 0.8577610 | 5 | 0.008636304 | Preprocessor1_Model2 |
| 8 | 5 | 2.368746e-04 | 4.157611e-09 | 0.2803323 | roc_auc | binary | 0.8508626 | 5 | 0.009458088 | Preprocessor1_Model6 |
| 12 | 6 | 2.172910e-09 | 3.994592e-04 | 0.4225346 | roc_auc | binary | 0.8450335 | 5 | 0.009225441 | Preprocessor1_Model1 |
| 17 | 9 | 9.919712e-08 | 1.642156e-10 | 0.7587023 | roc_auc | binary | 0.8428106 | 5 | 0.007700835 | Preprocessor1_Model4 |
| 30 | 8 | 4.559401e-07 | 1.177675e+00 | 0.8194855 | roc_auc | binary | 0.8390279 | 5 | 0.008436474 | Preprocessor1_Model9 |
autoplot(xgboost_tune) +
theme_bw()final_wkflow <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "roc_auc"))
final_wkflow══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
14 Recipe Steps
• step_filter()
• step_mutate()
• step_mutate()
• step_rm()
• step_indicate_na()
• step_impute_mean()
• step_string2factor()
• step_unknown()
• step_novel()
• step_other()
• ...
• and 4 more steps.
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)
Main Arguments:
trees = 2000
min_n = 23
tree_depth = 15
learn_rate = 0.00591673615298013
loss_reduction = 3.41525742335076e-07
sample_size = 0.648717504832894
Computational engine: xgboost
xgboost_fit <- last_fit(final_wkflow, split)collect_metrics(xgboost_fit) %>%
gt()| .metric | .estimator | .estimate | .config |
|---|---|---|---|
| accuracy | binary | 0.7518783 | Preprocessor1_Model1 |
| roc_auc | binary | 0.8771489 | Preprocessor1_Model1 |
# don't run the rest
knitr::knit_exit()